{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Install Dask (with all of its optional dependencies) before running this notebook:\n",
    "\n",
    "```\n",
    "python -m pip install \"dask[complete]\"\n",
    "```"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "import glob\n",
    "import os\n",
    "\n",
    "import dask.dataframe as dd\n",
    "import pandas as pd\n",
    "from sklearn.datasets import make_classification\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "创建 test_1.csv\n",
      "创建 test_2.csv\n",
      "创建 test_3.csv\n",
      "创建 test_4.csv\n",
      "创建 test_5.csv\n",
      "创建 test_6.csv\n",
      "创建 test_7.csv\n",
      "创建 test_8.csv\n",
      "创建 test_9.csv\n",
      "创建 test_10.csv\n"
     ]
    }
   ],
   "source": [
    "# Generate ten synthetic classification datasets (100,000 rows x 100 features plus a\n",
    "# label column 'y') and write each one to its own CSV under ./data/dask_data/.\n",
    "# The target directory is created first so the cell also works on a fresh checkout.\n",
    "os.makedirs('./data/dask_data', exist_ok=True)\n",
    "\n",
    "for i in range(1, 11):\n",
    "    print('创建 test_%d.csv' % i)\n",
    "    # Seed with the file number: every file gets different data, but a re-run\n",
    "    # regenerates exactly the same ten files.\n",
    "    x, y = make_classification(n_samples=100_000, n_features=100, random_state=i)\n",
    "    df = pd.DataFrame(data=x)\n",
    "    df['y'] = y\n",
    "    df.to_csv('./data/dask_data/test_%d.csv' % i, index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 20.4 s, sys: 3.94 s, total: 24.3 s\n",
      "Wall time: 24.3 s\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "(1000000, 101)"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%time\n",
    "\n",
    "# Baseline: read every generated CSV eagerly with pandas and stack the pieces\n",
    "# into a single in-memory frame. The last expression displays (rows, columns).\n",
    "csv_paths = glob.glob('./data/dask_data/test_*.csv')\n",
    "df = pd.concat([pd.read_csv(csv_path) for csv_path in csv_paths])\n",
    "df.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "Int64Index: 1000000 entries, 0 to 99999\n",
      "Columns: 101 entries, 0 to y\n",
      "dtypes: float64(100), int64(1)\n",
      "memory usage: 778.2 MB\n"
     ]
    }
   ],
   "source": [
    "df.info(memory_usage='deep')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "      <th>3</th>\n",
       "      <th>4</th>\n",
       "      <th>5</th>\n",
       "      <th>6</th>\n",
       "      <th>7</th>\n",
       "      <th>8</th>\n",
       "      <th>9</th>\n",
       "      <th>...</th>\n",
       "      <th>91</th>\n",
       "      <th>92</th>\n",
       "      <th>93</th>\n",
       "      <th>94</th>\n",
       "      <th>95</th>\n",
       "      <th>96</th>\n",
       "      <th>97</th>\n",
       "      <th>98</th>\n",
       "      <th>99</th>\n",
       "      <th>y</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1.828721</td>\n",
       "      <td>-2.114304</td>\n",
       "      <td>-0.905244</td>\n",
       "      <td>-0.259237</td>\n",
       "      <td>1.412799</td>\n",
       "      <td>-0.067901</td>\n",
       "      <td>1.286529</td>\n",
       "      <td>-0.882725</td>\n",
       "      <td>-0.847326</td>\n",
       "      <td>0.400505</td>\n",
       "      <td>...</td>\n",
       "      <td>-2.879399</td>\n",
       "      <td>-0.231125</td>\n",
       "      <td>1.048071</td>\n",
       "      <td>-1.973970</td>\n",
       "      <td>0.334500</td>\n",
       "      <td>-1.386493</td>\n",
       "      <td>0.696828</td>\n",
       "      <td>0.092615</td>\n",
       "      <td>0.565590</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1.383999</td>\n",
       "      <td>0.824542</td>\n",
       "      <td>0.339164</td>\n",
       "      <td>-1.430962</td>\n",
       "      <td>-0.601582</td>\n",
       "      <td>-0.239777</td>\n",
       "      <td>0.385469</td>\n",
       "      <td>-0.321220</td>\n",
       "      <td>-0.892520</td>\n",
       "      <td>1.457037</td>\n",
       "      <td>...</td>\n",
       "      <td>1.316623</td>\n",
       "      <td>-0.593951</td>\n",
       "      <td>0.367487</td>\n",
       "      <td>0.555137</td>\n",
       "      <td>0.958412</td>\n",
       "      <td>-0.106074</td>\n",
       "      <td>0.268736</td>\n",
       "      <td>0.820467</td>\n",
       "      <td>1.299332</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>-1.349638</td>\n",
       "      <td>-0.363348</td>\n",
       "      <td>0.127111</td>\n",
       "      <td>-0.442155</td>\n",
       "      <td>0.320757</td>\n",
       "      <td>0.170465</td>\n",
       "      <td>-1.235266</td>\n",
       "      <td>0.518046</td>\n",
       "      <td>-2.468218</td>\n",
       "      <td>0.639260</td>\n",
       "      <td>...</td>\n",
       "      <td>-0.704430</td>\n",
       "      <td>-0.987114</td>\n",
       "      <td>0.722610</td>\n",
       "      <td>-0.128892</td>\n",
       "      <td>0.271635</td>\n",
       "      <td>-1.149973</td>\n",
       "      <td>0.834952</td>\n",
       "      <td>-0.249592</td>\n",
       "      <td>-0.554042</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>-1.043174</td>\n",
       "      <td>-0.468837</td>\n",
       "      <td>1.994105</td>\n",
       "      <td>-1.707017</td>\n",
       "      <td>1.663338</td>\n",
       "      <td>-0.060764</td>\n",
       "      <td>1.509859</td>\n",
       "      <td>1.036977</td>\n",
       "      <td>0.694708</td>\n",
       "      <td>-1.354223</td>\n",
       "      <td>...</td>\n",
       "      <td>0.871424</td>\n",
       "      <td>1.516713</td>\n",
       "      <td>1.866168</td>\n",
       "      <td>-0.690004</td>\n",
       "      <td>1.341017</td>\n",
       "      <td>0.104675</td>\n",
       "      <td>-1.133161</td>\n",
       "      <td>0.581784</td>\n",
       "      <td>0.535151</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1.322279</td>\n",
       "      <td>-0.850668</td>\n",
       "      <td>0.710705</td>\n",
       "      <td>-0.991021</td>\n",
       "      <td>-1.226483</td>\n",
       "      <td>-0.521772</td>\n",
       "      <td>0.249594</td>\n",
       "      <td>0.164892</td>\n",
       "      <td>-0.682996</td>\n",
       "      <td>1.273771</td>\n",
       "      <td>...</td>\n",
       "      <td>0.868495</td>\n",
       "      <td>-0.780112</td>\n",
       "      <td>-0.016639</td>\n",
       "      <td>-0.241537</td>\n",
       "      <td>0.616442</td>\n",
       "      <td>0.188719</td>\n",
       "      <td>0.010123</td>\n",
       "      <td>-0.906872</td>\n",
       "      <td>0.007898</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 101 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "          0         1         2         3         4         5         6  \\\n",
       "0  1.828721 -2.114304 -0.905244 -0.259237  1.412799 -0.067901  1.286529   \n",
       "1  1.383999  0.824542  0.339164 -1.430962 -0.601582 -0.239777  0.385469   \n",
       "2 -1.349638 -0.363348  0.127111 -0.442155  0.320757  0.170465 -1.235266   \n",
       "3 -1.043174 -0.468837  1.994105 -1.707017  1.663338 -0.060764  1.509859   \n",
       "4  1.322279 -0.850668  0.710705 -0.991021 -1.226483 -0.521772  0.249594   \n",
       "\n",
       "          7         8         9  ...        91        92        93        94  \\\n",
       "0 -0.882725 -0.847326  0.400505  ... -2.879399 -0.231125  1.048071 -1.973970   \n",
       "1 -0.321220 -0.892520  1.457037  ...  1.316623 -0.593951  0.367487  0.555137   \n",
       "2  0.518046 -2.468218  0.639260  ... -0.704430 -0.987114  0.722610 -0.128892   \n",
       "3  1.036977  0.694708 -1.354223  ...  0.871424  1.516713  1.866168 -0.690004   \n",
       "4  0.164892 -0.682996  1.273771  ...  0.868495 -0.780112 -0.016639 -0.241537   \n",
       "\n",
       "         95        96        97        98        99  y  \n",
       "0  0.334500 -1.386493  0.696828  0.092615  0.565590  0  \n",
       "1  0.958412 -0.106074  0.268736  0.820467  1.299332  0  \n",
       "2  0.271635 -1.149973  0.834952 -0.249592 -0.554042  1  \n",
       "3  1.341017  0.104675 -1.133161  0.581784  0.535151  0  \n",
       "4  0.616442  0.188719  0.010123 -0.906872  0.007898  0  \n",
       "\n",
       "[5 rows x 101 columns]"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 144 ms, sys: 182 µs, total: 144 ms\n",
      "Wall time: 142 ms\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "\n",
    "# Same files through Dask: a single glob pattern replaces the explicit pandas loop.\n",
    "# No .compute() is called here, so `df` is a Dask DataFrame rather than loaded data.\n",
    "csv_pattern = './data/dask_data/test_*.csv'\n",
    "df = dd.read_csv(csv_pattern)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'dask.dataframe.core.DataFrame'>\n",
      "Columns: 101 entries, 0 to y\n",
      "dtypes: float64(100), int64(1)\n",
      "memory usage: 770.6 MB\n"
     ]
    }
   ],
   "source": [
    "df.info(memory_usage='deep')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 35.4 s, sys: 3.52 s, total: 38.9 s\n",
      "Wall time: 8.11 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "\n",
    "# Time the full Dask load: describe the read first, then call .compute() to\n",
    "# materialize the result, which rebinds `df` to the computed frame.\n",
    "lazy_frame = dd.read_csv('./data/dask_data/test_*.csv')\n",
    "df = lazy_frame.compute()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "Int64Index: 1000000 entries, 0 to 2291\n",
      "Columns: 101 entries, 0 to y\n",
      "dtypes: float64(100), int64(1)\n",
      "memory usage: 778.2 MB\n"
     ]
    }
   ],
   "source": [
    "df.info(memory_usage='deep')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
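    "# Optional: Graphviz system package used by the visualize() call in the next cell (Debian/Ubuntu).\n",
    "#!sudo apt-get install graphviz"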
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build the Dask frame over the generated CSVs again and save a drawing of its\n",
    "# task graph to exec_graph.pdf.\n",
    "csv_pattern = './data/dask_data/test_*.csv'\n",
    "exec_graph = dd.read_csv(csv_pattern)\n",
    "exec_graph.visualize(filename='exec_graph.pdf')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
